POC modeled VS POC observed

Investigate whether modeled POC is representative of POC observations.

Author

Thelma Panaïotis

source("utils.R")

POC observation data / sediment traps

Read data

Read data, keep columns of interest, rename, drop observations where poc is missing, and keep only sediment traps.

# Read the sediment-trap flux table (metadata header ends at line 87)
df_trap_raw <- read_delim(
  "data/raw/GO_flux.tab",
  delim = "\t", escape_double = FALSE, trim_ws = TRUE,
  skip = 87, show_col_types = FALSE
)

# Keep and rename the columns of interest, restrict to sediment-trap
# records that have a POC flux value.
df_trap <- df_trap_raw %>%
  select(
    id_ref = `ID (Reference identifier)`,
    id_loc = `ID (Unique location identifier)`,
    type = `Type (Data type)`,
    lon = Longitude,
    lat = Latitude,
    depth = `Depth water [m] (Sediment trap deployment depth)`,
    start_date = `Date/Time (Deployed)`,
    end_date = `Date/time end (Retrieved)`,
    duration = `Duration [days]`,
    poc = `POC flux [mg/m**2/day]`
  ) %>%
  # sediment traps only
  filter(type == "sediment_trap") %>%
  # discard records with no POC flux
  drop_na(poc)

Distribution in space and time

Number of observations at each location.

# Map the number of observations available at each (lon, lat) location,
# on a log10 colour scale.
df_trap %>%
  add_count(lon, lat) %>%
  ggplot(aes(x = lon, y = lat)) +
  geom_polygon(data = world, aes(group = group), fill = "grey") +
  geom_point(aes(colour = n)) +
  scale_colour_viridis_c(trans = "log10") +
  coord_quickmap(expand = 0)

Depth distribution

ggplot(df_trap) + geom_histogram(aes(x = depth), binwidth = 100)

Time distribution

ggplot(df_trap) + geom_histogram(aes(x = start_date))

ggplot(df_trap) + geom_histogram(aes(x = duration)) + scale_x_log10()

Data points around the 1000 m horizon

Let’s focus on the data around 1000 m depth. For a set of depth half-widths W between 100 and 500 m, compute the number of observations within 1000 ± W and map them.

# Target depth horizon (m)
hor <- 1000
# For each half-width W, keep observations within hor ± W and tag them
# with the half-width used; stack all subsets into one dataframe.
ranges <- seq(from = 100, to = 500, by = 50) %>%
  lapply(function(half_width) {
    df_trap %>%
      filter(between(depth, hor - half_width, hor + half_width)) %>%
      mutate(W = half_width)
  }) %>%
  bind_rows()

# Number of retained observations as a function of the half-width W
ranges %>%
  count(W) %>%
  ggplot(aes(x = W, y = n)) +
  geom_col() +
  ggtitle("Number of data points within 1000 ± W depth range")

# Map retained observations for each half-width W (one facet per W),
# coloured by trap deployment depth.
ranges %>%
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = depth)) +
  scale_colour_cmocean(name = "deep") +
  coord_quickmap(expand = 0) +
  facet_wrap(~W)

Fixed depth range

Let’s use a depth half-width of 500 m (i.e. keep observations within 1000 ± 500 m). Look at the distribution of POC values, both untransformed and log-transformed.

# Restrict traps to the 1000 ± 500 m layer and add log-transformed POC
W <- 500
df_trap <- df_trap %>%
  filter(between(depth, hor - W, hor + W)) %>%
  mutate(poc_log = log(poc))

# POC distribution, untransformed
ggplot(df_trap) + geom_histogram(aes(x = poc))

# ... and log-transformed: much closer to a normal distribution
ggplot(df_trap) + geom_histogram(aes(x = poc_log))

Log-transformed poc is closer to a normal distribution, but if we want to predict modeled poc from observed poc, depending on the prediction model, the distribution of the predictor may not be important.

Let’s now round coordinates to match with Wang et al., 2023 which uses a grid of 2°×2°.

# Average observations onto the 2° x 2° grid of Wang et al. (2023)
df_trap_pix <- df_trap %>%
  mutate(
    # carbon grid longitudes are odd degrees: floor to even, shift by +1
    lon = roundp(lon, precision = 2, f = floor) + 1,
    # carbon grid latitudes are even degrees: plain rounding
    lat = roundp(lat, precision = 2, f = round)
  ) %>%
  group_by(lon, lat) %>%
  summarise(
    poc_mean = mean(poc),
    poc_sd = sd(poc),
    .groups = "drop"
  )

# Map of per-pixel mean POC flux
df_trap_pix %>%
  ggplot(aes(x = lon, y = lat)) +
  geom_polygon(data = world, aes(group = group), fill = "grey") +
  geom_point(aes(colour = poc_mean)) +
  scale_colour_cmocean(name = "matter") +
  coord_quickmap(expand = 0)

# Map of per-pixel POC flux standard deviation (NA for single-obs pixels)
df_trap_pix %>%
  ggplot(aes(x = lon, y = lat)) +
  geom_polygon(data = world, aes(group = group), fill = "grey") +
  geom_point(aes(colour = poc_sd)) +
  scale_colour_viridis_c() +
  coord_quickmap(expand = 0)

Let’s check the number of observations per pixel.

# Number of trap observations per 2° pixel, on a log10 colour scale
df_trap %>%
  mutate(
    lon = roundp(lon, precision = 2, f = floor) + 1, # odd grid longitudes
    lat = roundp(lat, precision = 2, f = round)      # even grid latitudes
  ) %>%
  count(lon, lat) %>%
  ggplot(aes(x = lon, y = lat)) +
  geom_polygon(data = world, aes(group = group), fill = "grey") +
  geom_point(aes(colour = n)) +
  scale_colour_viridis_c(trans = "log10") +
  coord_quickmap(expand = 0)

Looks like we have 2 timeseries with > 300 observations.

What do we do with these timeseries?

Match with POC modeled data

Read modeled data

# Read the Wang modeled export fields from the MAT file
mat_exp <- readMat("data/raw/Cexp_CAFE_kl24h.mat")$EXP[,,1]
poc_exp <- mat_exp$POCexp   # POC flux at base of euphotic layer (73 m)
poc_100 <- mat_exp$POC100   # POC flux at 100 m
poc_1000 <- mat_exp$POC1000 # POC flux at 1000 m

# Label matrix dimensions with grid coordinates: columns are odd
# longitudes (1, 3, ..., 359), rows are even latitudes.
# NOTE(review): (0:90) * 2 - 90 yields 91 latitude labels (-90 ... 90) —
# confirm this matches the number of matrix rows.
grid_lons <- c(0.5:179.5) * 2
grid_lats <- (c(0:90) * 2) - 90
dimnames(poc_exp) <- list(grid_lats, grid_lons)
dimnames(poc_100) <- list(grid_lats, grid_lons)
dimnames(poc_1000) <- list(grid_lats, grid_lons)


## Convert to dataframe

# Turn one POC matrix (latitudes as rownames, longitudes as colnames)
# into a long tibble: numeric coordinates, longitudes wrapped from
# [0, 360) to [-180, 180], and zeros (land / no data) recoded as NA.
# Factored into a helper because the same pipeline was triplicated.
mat_to_df <- function(mat, value_name) {
  mat %>%
    as.data.frame() %>%
    rownames_to_column(var = "lat") %>%
    as_tibble() %>%
    pivot_longer(cols = -lat, names_to = "lon", values_to = value_name) %>%
    mutate(lat = as.numeric(lat), lon = as.numeric(lon)) %>%
    mutate(lon = ifelse(lon > 180, lon - 360, lon)) %>%
    mutate("{value_name}" := ifelse(.data[[value_name]] == 0, NA, .data[[value_name]])) %>%
    arrange(lon, lat)
}

df_exp <- mat_to_df(poc_exp, "poc_exp")
df_100 <- mat_to_df(poc_100, "poc_100")
df_1000 <- mat_to_df(poc_1000, "poc_1000")

# Join the three depth horizons on the common grid
df_mod <- df_exp %>%
  left_join(df_100, by = join_by(lat, lon)) %>%
  left_join(df_1000, by = join_by(lat, lon))

# Convert fluxes from mmol C m-2 year-1 to mg C m-2 day-1
# (365.25 days per year; 12 mg C per mmol C)
df_mod <- df_mod %>%
  mutate(across(c(poc_exp, poc_100, poc_1000), ~ (.x / 365.25) * 12))

# Maps of modeled POC at each horizon, with trap pixels overlaid.
# Local helper to avoid repeating the same three layers.
plot_mod_map <- function(var) {
  ggmap(df_mod, var = var, type = "raster") +
    scale_fill_cmocean(name = "matter", na.value = NA) +
    geom_point(data = df_trap_pix, aes(x = lon, y = lat), size = 0.5)
}

plot_mod_map("poc_exp")

plot_mod_map("poc_100")

plot_mod_map("poc_1000")

Join

Join by coordinates, drop observations where modeled poc is not available.

# Match trap pixels with modeled values; keep only pixels where the
# model provides both the 100 m and 1000 m fluxes.
df_all <- df_trap_pix %>% 
  left_join(df_mod, by = join_by(lon, lat)) %>% 
  drop_na(poc_100, poc_1000)

Let’s plot a map of our final dataset.

# Map of the pixels retained in the final matched dataset
df_all %>%
  ggplot(aes(x = lon, y = lat)) +
  geom_polygon(data = world, aes(group = group), fill = "grey") +
  geom_point() +
  coord_quickmap(expand = 0)

Let’s also have a look at the data distribution.

# Distribution of observed (pixel-mean) POC
ggplot(df_all, aes(x = poc_mean)) + geom_histogram()

# Distribution of modeled POC at 1000 m
ggplot(df_all, aes(x = poc_1000)) + geom_histogram()

Let’s still plot modeled VS observations, both in untransformed and log-transformed spaces.

# Modeled vs observed POC; the red line is the 1:1 identity
ggplot(df_all, aes(x = poc_mean, y = poc_1000)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, colour = "red")

# Same comparison in log-log space
ggplot(df_all, aes(x = poc_mean, y = poc_1000)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  scale_x_log10() +
  scale_y_log10()

Let’s compute the correlation.

# Untransformed
cor(df_all$poc_mean, df_all$poc_1000)
[1] 0.1169209
# Log-transformed
cor(log(df_all$poc_mean), log(df_all$poc_1000))
[1] 0.2019889

Correlation is pretty bad. Will be hard to predict modeled value from observation values.

POC observation data / ARGO

library(raveio)
# Open mat file of monthly BGC-Argo flux maps with raveio::read_mat()
globe <- read_mat("data/raw/BGC_Argo_monthly_maps_GLOBESINK_global_beta20240212auto_baseline_QC_only_fluxes.mat")


# Read POC flux fields (large and small particle size classes)
# NB: these are mg C m-2 day-1, same as Wang
poc_l <- globe$`monthly_maps/flux/POC_l`
poc_s <- globe$`monthly_maps/flux/POC_s`

# Midpoints of bins from their edges (the same computation was
# repeated three times; factored into a helper)
bin_centres <- function(edges) {
  head(edges, -1) + diff(edges) / 2
}

# Read coordinates: convert bin edges to bin centres
depths <- bin_centres(globe$zbin_edges[1, ])
lats <- bin_centres(globe$lat_edges[1, ])
lons <- bin_centres(globe$lon_edges[1, ])

# Reorder POC dimensions from (depth, lat, lon, month) to
# (lon, lat, depth, month)
poc_l <- aperm(poc_l, c(3, 2, 1, 4))
poc_s <- aperm(poc_s, c(3, 2, 1, 4))

# Collapse the month dimension: annual mean at every grid cell and depth
# (na.rm so cells observed only part of the year still get a value)
poc_l <- apply(poc_l, c(1, 2, 3), mean, na.rm = TRUE)
poc_s <- apply(poc_s, c(1, 2, 3), mean, na.rm = TRUE)

## Convert to dataframe
# Gather both arrays in a named list (the original grew a list from
# `c()` via `$<-`; build it directly instead)
poc <- list(poc_l = poc_l, poc_s = poc_s)

# Unroll each array column-major into a vector and bind as columns
poc_v <- lapply(poc, as.vector)
df_argo <- do.call(cbind, poc_v) %>% as.data.frame() %>% setNames(names(poc_v))

# Add coordinates. as.vector() unrolls the first dimension (lon)
# fastest, then lat, then depth, so the coordinate columns are built in
# that order; the shorter lon vector is recycled across remaining rows.
df_argo$lon <- lons
df_argo$lat <- rep(lats, each = length(lons))
df_argo$depth <- rep(depths, each = length(lons) * length(lats))

# Reorder columns and compute total POC (small + large particles)
df_argo <- df_argo %>%
  mutate(poc_tot = poc_s + poc_l) %>%
  select(lon, lat, depth, everything()) %>%
  as_tibble()

Explore data, surface data.

# Surface slice (5 m bin centre).
# NOTE(review): exact floating-point equality on depth — confirm the
# bin centres are exactly representable, otherwise use near().
df_argo_sl <- filter(df_argo, depth == 5)
ggmap(df_argo_sl, var = "poc_tot", type = "raster") + scale_fill_cmocean(name = "matter", na.value = NA)

Explore data at the base of euphotic layer, using depth bin 75.0.

See how these values compare to outputs from Wang. First, we need to round coordinates from the Wang data to match the ARGO product grid, which uses bins of:

  • 8° in longitude

  • 4° in latitude

t_depth <- 75 # target depth: base of the euphotic layer
# Slice the ARGO data at the target depth bin and map total POC
df_argo_sl <- df_argo %>% filter(depth == t_depth)
ggmap(df_argo_sl, var = "poc_tot", type = "raster") + scale_fill_cmocean(name = "matter", na.value = NA)

# Regrid the Wang model to the ARGO grid (8° lon x 4° lat), join with
# the ARGO slice, and pivot the size classes to long format.
df_join <- df_mod %>%
  mutate(
    lon = roundp(lon, precision = 8, f = round),
    lat = roundp(lat, precision = 4, f = round)
  ) %>%
  group_by(lon, lat) %>%
  # average modeled export within each coarse pixel
  summarise(poc_mod = mean(poc_exp), .groups = "drop") %>%
  left_join(filter(df_argo_sl, depth == t_depth), by = join_by(lon, lat)) %>%
  select(-depth) %>%
  pivot_longer(poc_l:poc_tot, names_to = "size_range", values_to = "poc_obs")
  
# Modeled vs observed POC, one panel per size class, coloured by
# absolute latitude; the red line is the 1:1 identity.
ggplot(df_join) + 
  geom_abline(slope = 1, intercept = 0, colour = "red") +
  geom_point(aes(x = poc_obs, y = poc_mod, colour = abs(lat)), size = 0.5) + 
  scale_colour_viridis_c() +
  facet_wrap(~size_range, scales = "free", ncol = 3)

## Compute correlations per group
# Pearson correlation between modeled and observed POC for each size
# class. map_dbl() returns the numeric directly, so the map() + unnest()
# round-trip of the original is unnecessary; the result is identical.
df_join %>% 
  select(size_range, poc_mod, poc_obs) %>% 
  nest(data = c(poc_mod, poc_obs)) %>% 
  mutate(
    corr = map_dbl(data, ~cor(.$poc_mod, .$poc_obs, use = "pairwise.complete.obs"))
  )
# A tibble: 3 × 3
  size_range data                     corr
  <chr>      <list>                  <dbl>
1 poc_l      <tibble [2,025 × 2]> -0.0472 
2 poc_s      <tibble [2,025 × 2]>  0.342  
3 poc_tot    <tibble [2,025 × 2]>  0.00628

Explore data, using depth bin 937.5, the closest to 1000 m.

t_depth <- 937.5 # target depth: closest available bin to 1000 m
# Slice the ARGO data at the target depth bin and map total POC
df_argo_sl <- df_argo %>% filter(depth == t_depth)
ggmap(df_argo_sl, var = "poc_tot", type = "raster") + scale_fill_cmocean(name = "matter", na.value = NA)

# Same regrid-and-join as for the surface comparison, now against the
# ARGO slice closest to 1000 m.
df_join <- df_mod %>%
  mutate(
    lon = roundp(lon, precision = 8, f = round),
    lat = roundp(lat, precision = 4, f = round)
  ) %>%
  group_by(lon, lat) %>%
  # average modeled export within each coarse pixel
  summarise(poc_mod = mean(poc_exp), .groups = "drop") %>%
  left_join(filter(df_argo_sl, depth == t_depth), by = join_by(lon, lat)) %>%
  select(-depth) %>%
  pivot_longer(poc_l:poc_tot, names_to = "size_range", values_to = "poc_obs")
  
# Modeled vs observed POC near 1000 m, one panel per size class,
# coloured by absolute latitude; the red line is the 1:1 identity.
ggplot(df_join) + 
  geom_abline(slope = 1, intercept = 0, colour = "red") +
  geom_point(aes(x = poc_obs, y = poc_mod, colour = abs(lat)), size = 0.5) + 
  scale_colour_viridis_c() +
  facet_wrap(~size_range, scales = "free", ncol = 3)

## Compute correlations per group
# Pearson correlation between modeled and observed POC for each size
# class. map_dbl() returns the numeric directly, so the map() + unnest()
# round-trip of the original is unnecessary; the result is identical.
df_join %>% 
  select(size_range, poc_mod, poc_obs) %>% 
  nest(data = c(poc_mod, poc_obs)) %>% 
  mutate(
    corr = map_dbl(data, ~cor(.$poc_mod, .$poc_obs, use = "pairwise.complete.obs"))
  )
# A tibble: 3 × 3
  size_range data                  corr
  <chr>      <list>               <dbl>
1 poc_l      <tibble [2,025 × 2]> 0.185
2 poc_s      <tibble [2,025 × 2]> 0.131
3 poc_tot    <tibble [2,025 × 2]> 0.198

POC attenuation

Between ~100 and ~1000 m.

Wang

# Attenuation in the Wang model: ratio of the 1000 m flux to the 100 m flux
df_mod <- df_mod %>% mutate(att = poc_1000 / poc_100)
ggmap(df_mod, var = "att", type = "raster") + labs(fill = "POC\natten.")

ARGO

# Attenuation in the ARGO product: ratio of the 475 m flux to the
# 175 m flux, per pixel.
# NOTE(review): the section header announces ~100 to ~1000 m but the
# bins used here are 175 and 475 m — confirm which horizons are intended.
df_argo_att <- df_argo %>%
  select(lon, lat, depth, poc_tot) %>%
  filter(depth %in% c(175, 475)) %>%
  mutate(layer = ifelse(depth == 175, "shallow", "deep")) %>%
  select(-depth) %>%
  pivot_wider(names_from = "layer", values_from = "poc_tot") %>%
  mutate(att = deep / shallow)

ggmap(df_argo_att, var = "att", type = "raster")

# A deep/shallow flux ratio should be <= 1: map only plausible values
ggmap(df_argo_att %>% filter(att <= 1), var = "att", type = "raster")

Estimate the attenuation exponent b of the power law F(z) = F(z0) · (z/z0)^b

# Reference depth z0 for the power-law fit (m)
z0 <- 75

# Keep the total POC profile below z0 and tag each pixel with a
# unique identifier built from its coordinates
df_argo <- df_argo %>%
  select(lon, lat, depth, poc_tot) %>%
  mutate(pix_id = paste0("lon_", lon, "_lat_", lat), .before = lon) %>%
  filter(depth >= z0)

# Discard pixels with no POC data at any depth
df_argo <- df_argo %>%
  mutate(n_obs = sum(!is.na(poc_tot)), .by = c(pix_id, lon, lat)) %>%
  filter(n_obs > 0)

# Select a few pixels for plots
# Draw 16 random pixels for profile plots (seeded for reproducibility)
set.seed(1)
pix_plot <- df_argo %>%
  distinct(pix_id) %>%
  slice_sample(n = 16) %>%
  pull(pix_id)

# POC profiles of the sampled pixels
df_argo %>%
  filter(pix_id %in% pix_plot) %>%
  ggplot(aes(x = poc_tot, y = -depth)) +
  geom_point() +
  facet_wrap(~pix_id)

# Example profiles: sampled pixels at lon = -128, sorted by depth
# within each pixel.
# Fix: arrange() ignores group_by() unless .by_group = TRUE, so the
# original group_by/arrange/ungroup only sorted globally by depth;
# sort explicitly by pixel then depth instead (downstream plots and
# the lm fit are order-invariant).
toto <- df_argo %>%
  filter(pix_id %in% pix_plot) %>%
  filter(lon == -128) %>%
  arrange(pix_id, depth)

# POC profiles per pixel
ggplot(toto) +
  geom_point(aes(x = poc_tot, y = -depth)) +
  facet_wrap(~pix_id)

# Same in log-log space: a straight line suggests power-law attenuation
ggplot(toto) +
  geom_point(aes(x = log(depth/z0), y = log(poc_tot))) +
  facet_wrap(~pix_id)

# Linear model in log-log space: log F(z) = a + b * log(z/z0),
# where the slope b is the attenuation exponent
att_fit <- lm(log(poc_tot) ~ log(depth/z0), data = toto)
summary(att_fit)

Call:
lm(formula = log(poc_tot) ~ log(depth/z0), data = toto)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.34606 -0.17043 -0.03345  0.11832  0.73952 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)    4.90355    0.13785   35.57  < 2e-16 ***
log(depth/z0) -1.00436    0.06357  -15.80 1.72e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2526 on 22 degrees of freedom
Multiple R-squared:  0.919, Adjusted R-squared:  0.9153 
F-statistic: 249.6 on 1 and 22 DF,  p-value: 1.72e-13
tidy(att_fit)
# A tibble: 2 × 5
  term          estimate std.error statistic  p.value
  <chr>            <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)       4.90    0.138       35.6 6.15e-21
2 log(depth/z0)    -1.00    0.0636     -15.8 1.72e-13